Biostat 212a Homework 2
Due Feb 8, 2025 @ 11:59PM
1 ISL Exercise 4.8.1 (10pts)
Solution:
2 ISL Exercise 4.8.6 (10pts)
Solution:
3 ISL Exercise 4.8.9 (10pts)
Solution:
4 ISL Exercise 4.8.13 (a)-(i) (50pts)
Solution:
library(ISLR2)
library(MASS)
data("Weekly")
# Structure of the dataset
str(Weekly)
'data.frame': 1089 obs. of 9 variables:
$ Year : num 1990 1990 1990 1990 1990 1990 1990 1990 1990 1990 ...
$ Lag1 : num 0.816 -0.27 -2.576 3.514 0.712 ...
$ Lag2 : num 1.572 0.816 -0.27 -2.576 3.514 ...
$ Lag3 : num -3.936 1.572 0.816 -0.27 -2.576 ...
$ Lag4 : num -0.229 -3.936 1.572 0.816 -0.27 ...
$ Lag5 : num -3.484 -0.229 -3.936 1.572 0.816 ...
$ Volume : num 0.155 0.149 0.16 0.162 0.154 ...
$ Today : num -0.27 -2.576 3.514 0.712 1.178 ...
$ Direction: Factor w/ 2 levels "Down","Up": 1 1 2 2 2 1 2 2 2 1 ...
(a)
# Numerical summary
summary(Weekly)
Year Lag1 Lag2 Lag3
Min. :1990 Min. :-18.1950 Min. :-18.1950 Min. :-18.1950
1st Qu.:1995 1st Qu.: -1.1540 1st Qu.: -1.1540 1st Qu.: -1.1580
Median :2000 Median : 0.2410 Median : 0.2410 Median : 0.2410
Mean :2000 Mean : 0.1506 Mean : 0.1511 Mean : 0.1472
3rd Qu.:2005 3rd Qu.: 1.4050 3rd Qu.: 1.4090 3rd Qu.: 1.4090
Max. :2010 Max. : 12.0260 Max. : 12.0260 Max. : 12.0260
Lag4 Lag5 Volume Today
Min. :-18.1950 Min. :-18.1950 Min. :0.08747 Min. :-18.1950
1st Qu.: -1.1580 1st Qu.: -1.1660 1st Qu.:0.33202 1st Qu.: -1.1540
Median : 0.2380 Median : 0.2340 Median :1.00268 Median : 0.2410
Mean : 0.1458 Mean : 0.1399 Mean :1.57462 Mean : 0.1499
3rd Qu.: 1.4090 3rd Qu.: 1.4050 3rd Qu.:2.05373 3rd Qu.: 1.4050
Max. : 12.0260 Max. : 12.0260 Max. :9.32821 Max. : 12.0260
Direction
Down:484
Up :605
# Plot the Volume over time
plot(Weekly$Year, Weekly$Volume, main="Trading Volume Over Time", xlab="Year", ylab="Volume", col="blue", pch=20)
# Boxplot of market return (Today) by Direction
boxplot(Today ~ Direction, data=Weekly, main="Market Return by Direction", ylab="Today’s Return", col=c("red", "green"))
# Correlation matrix (excluding categorical variables)
cor(Weekly[, -9]) # Exclude the Direction column
Year Lag1 Lag2 Lag3 Lag4
Year 1.00000000 -0.032289274 -0.03339001 -0.03000649 -0.031127923
Lag1 -0.03228927 1.000000000 -0.07485305 0.05863568 -0.071273876
Lag2 -0.03339001 -0.074853051 1.00000000 -0.07572091 0.058381535
Lag3 -0.03000649 0.058635682 -0.07572091 1.00000000 -0.075395865
Lag4 -0.03112792 -0.071273876 0.05838153 -0.07539587 1.000000000
Lag5 -0.03051910 -0.008183096 -0.07249948 0.06065717 -0.075675027
Volume 0.84194162 -0.064951313 -0.08551314 -0.06928771 -0.061074617
Today -0.03245989 -0.075031842 0.05916672 -0.07124364 -0.007825873
Lag5 Volume Today
Year -0.030519101 0.84194162 -0.032459894
Lag1 -0.008183096 -0.06495131 -0.075031842
Lag2 -0.072499482 -0.08551314 0.059166717
Lag3 0.060657175 -0.06928771 -0.071243639
Lag4 -0.075675027 -0.06107462 -0.007825873
Lag5 1.000000000 -0.05851741 0.011012698
Volume -0.058517414 1.00000000 -0.033077783
Today 0.011012698 -0.03307778 1.000000000
(b)
# (b) Logistic regression of Direction on the five lag variables and
# Volume, fit on the full Weekly data set
logistic_model <- glm(
  Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
  family = binomial,
  data = Weekly
)
# Coefficient estimates, z statistics and p-values
summary(logistic_model)
Call:
glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 +
Volume, family = binomial, data = Weekly)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 0.26686 0.08593 3.106 0.0019 **
Lag1 -0.04127 0.02641 -1.563 0.1181
Lag2 0.05844 0.02686 2.175 0.0296 *
Lag3 -0.01606 0.02666 -0.602 0.5469
Lag4 -0.02779 0.02646 -1.050 0.2937
Lag5 -0.01447 0.02638 -0.549 0.5833
Volume -0.02274 0.03690 -0.616 0.5377
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 1496.2 on 1088 degrees of freedom
Residual deviance: 1486.4 on 1082 degrees of freedom
AIC: 1500.4
Number of Fisher Scoring iterations: 4
\(log(\frac{P(Up)}{1-P(Up)})\) = 0.26686−0.04127 × Lag1+0.05844 × Lag2−0.01606 × Lag3−0.02779 × Lag4−0.01447 × Lag5−0.02274 × Volume.
Yes — only Lag2 is statistically significant among the predictors, with a p-value of 0.0296 < 0.05 (the intercept is also significant, p = 0.0019). Lag1, Lag3, Lag4, Lag5, and Volume all have p-values above 0.05 and are not statistically significant.
(c)
# (c) In-sample predicted probabilities of the market going "Up"
pred_probs <- predict(logistic_model, type = "response")
# Classify with a 0.5 cutoff: default to "Down", flip to "Up" where
# the fitted probability exceeds one half
pred_classes <- rep("Down", length(pred_probs))
pred_classes[pred_probs > 0.5] <- "Up"
# Confusion matrix of predicted vs. observed direction
conf_matrix <- table(Predicted = pred_classes, Actual = Weekly$Direction)
# Overall fraction of weeks classified correctly
accuracy <- mean(pred_classes == Weekly$Direction)
# Print results
print(conf_matrix)
Actual
Predicted Down Up
Down 54 48
Up 430 557
print(paste("Overall accuracy:", round(accuracy, 4)))
[1] "Overall accuracy: 0.5611"
True Positives (TP) = 557 ; False Positives (FP) = 430 ; True Negatives (TN) = 54 ; False Negatives (FN) = 48
Accuracy= \(\frac{TP+TN}{Total Samples} = \frac{557+54}{54+48+430+557}=0.5611\) . This is only slightly better than random guessing (50%).
The model is biased towards predicting “Up”, as indicated by the large number of false positives (FP = 430). The model fails to predict “Down” accurately, with only 54 correct “Down” predictions out of 484 actual “Down” instances.
(d)
# (d) Hold out 2009-2010 as the test period; train on 1990-2008
train <- Weekly$Year < 2009
train_data <- Weekly[train, ]
test_data <- Weekly[!train, ]
# Logistic regression with Lag2 as the only predictor
logistic_model_lag2 <- glm(Direction ~ Lag2, family = binomial, data = train_data)
# Predicted P(Up) for each held-out week
test_probs <- predict(logistic_model_lag2, newdata = test_data, type = "response")
# Classify at the 0.5 cutoff
test_preds <- ifelse(test_probs > 0.5, "Up", "Down")
# Confusion matrix on the held-out weeks
conf_matrix_test <- table(Predicted = test_preds, Actual = test_data$Direction)
# Fraction of test weeks classified correctly
test_accuracy <- mean(test_preds == test_data$Direction)
# Print results
print(conf_matrix_test)
Actual
Predicted Down Up
Down 9 5
Up 34 56
print(paste("Test accuracy:", round(test_accuracy, 4)))
[1] "Test accuracy: 0.625"
(e)
# (e) Linear discriminant analysis with Lag2, fit on the training years
lda_model <- lda(Direction ~ Lag2, data = train_data)
# predict() for an lda fit returns class labels, posterior
# probabilities and discriminant scores; keep the class labels
lda_preds <- predict(lda_model, newdata = test_data)
lda_classes <- lda_preds$class
# Confusion matrix and accuracy on the held-out weeks
conf_matrix_lda <- table(Predicted = lda_classes, Actual = test_data$Direction)
lda_accuracy <- mean(lda_classes == test_data$Direction)
# Print results
print(conf_matrix_lda)
Actual
Predicted Down Up
Down 9 5
Up 34 56
print(paste("LDA test accuracy:", round(lda_accuracy, 4)))
[1] "LDA test accuracy: 0.625"
(f)
# (f) Quadratic discriminant analysis with Lag2, fit on the training years
qda_model <- qda(Direction ~ Lag2, data = train_data)
# Keep only the predicted class labels for the 2009-2010 weeks
qda_preds <- predict(qda_model, newdata = test_data)
qda_classes <- qda_preds$class
# Confusion matrix and accuracy on the held-out weeks
conf_matrix_qda <- table(Predicted = qda_classes, Actual = test_data$Direction)
qda_accuracy <- mean(qda_classes == test_data$Direction)
# Print results
print(conf_matrix_qda)
Actual
Predicted Down Up
Down 0 0
Up 43 61
print(paste("QDA test accuracy:", round(qda_accuracy, 4)))
[1] "QDA test accuracy: 0.5865"
(g)
# (g) K-nearest neighbours with K = 1 using Lag2 alone
library(class)
# knn() expects matrix predictors and a factor of class labels
train_X <- train_data$Lag2
test_X <- test_data$Lag2
train_Y <- train_data$Direction
test_Y <- test_data$Direction
# K = 1: each test week takes the label of its single nearest
# training week.
# NOTE(review): knn() breaks distance ties at random, so results can
# vary between runs unless a seed is set beforehand.
knn_preds <- knn(
  train = matrix(train_X),
  test = matrix(test_X),
  cl = train_Y,
  k = 1
)
# Confusion matrix and accuracy on the held-out weeks
conf_matrix_knn <- table(Predicted = knn_preds, Actual = test_Y)
knn_accuracy <- mean(knn_preds == test_Y)
# Print results
print(conf_matrix_knn)
Actual
Predicted Down Up
Down 21 30
Up 22 31
print(paste("KNN (K=1) test accuracy:", round(knn_accuracy, 4)))
[1] "KNN (K=1) test accuracy: 0.5"
(h)
# (h) Gaussian naive Bayes with Lag2, fit on the training years
library(e1071)
nb_model <- naiveBayes(Direction ~ Lag2, data = train_data)
# Predicted class labels for the 2009-2010 test weeks
nb_preds <- predict(nb_model, newdata = test_data)
# Confusion matrix and accuracy on the held-out weeks
conf_matrix_nb <- table(Predicted = nb_preds, Actual = test_data$Direction)
nb_accuracy <- mean(nb_preds == test_data$Direction)
# Print results
print(conf_matrix_nb)
Actual
Predicted Down Up
Down 0 0
Up 43 61
print(paste("Naive Bayes test accuracy:", round(nb_accuracy, 4)))
[1] "Naive Bayes test accuracy: 0.5865"
(i)
# (i) Collect the held-out accuracies of the five classifiers fit
# in parts (d)-(h) into a single comparison table
model_names <- c(
  "Logistic Regression", "LDA", "QDA", "KNN (K=1)", "Naive Bayes"
)
model_accuracies <- c(
  test_accuracy, lda_accuracy, qda_accuracy, knn_accuracy, nb_accuracy
)
model_comparison <- data.frame(Model = model_names, Accuracy = model_accuracies)
# Print comparison results
print(model_comparison)
Model Accuracy
1 Logistic Regression 0.6250000
2 LDA 0.6250000
3 QDA 0.5865385
4 KNN (K=1) 0.5000000
5 Naive Bayes 0.5865385
The Logistic Regression and LDA appear to have the best results on this data, and they both have 0.625 accuracy.
5 Bonus question: ISL Exercise 4.8.13 Part (j) (30pts)
Solution:
(j) Logistic Regression with multiple predictors
# Logistic regression using all five lags plus Volume, trained on the
# 1990-2008 weeks and evaluated on the 2009-2010 test weeks
logistic_model_extended <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
data=train_data, family=binomial)
# Predictions: P(Up) for each held-out week, classified at a 0.5 cutoff
test_probs_extended <- predict(logistic_model_extended, newdata=test_data, type="response")
test_preds_extended <- ifelse(test_probs_extended > 0.5, "Up", "Down")
# Confusion Matrix and held-out accuracy
conf_matrix_logistic_extended <- table(Predicted = test_preds_extended, Actual = test_data$Direction)
logistic_accuracy_extended <- mean(test_preds_extended == test_data$Direction)
print(conf_matrix_logistic_extended)
Actual
Predicted Down Up
Down 31 44
Up 12 17
print(paste("Extended Logistic Regression Accuracy:", round(logistic_accuracy_extended, 4)))
[1] "Extended Logistic Regression Accuracy: 0.4615"
Logistic Regression with interaction terms
# Logistic regression with an interaction: Lag2 * Volume expands to
# Lag2 + Volume + Lag2:Volume
logistic_model_interaction <- glm(Direction ~ Lag2 * Volume,
data=train_data, family=binomial)
# Predictions: P(Up) for the held-out weeks, classified at a 0.5 cutoff
test_probs_interaction <- predict(logistic_model_interaction,
newdata=test_data, type="response")
test_preds_interaction <- ifelse(test_probs_interaction > 0.5, "Up", "Down")
# Confusion Matrix and held-out accuracy
conf_matrix_logistic_interaction <- table(Predicted = test_preds_interaction,
Actual = test_data$Direction)
logistic_accuracy_interaction <- mean(
test_preds_interaction == test_data$Direction
)
print(conf_matrix_logistic_interaction)
Actual
Predicted Down Up
Down 20 25
Up 23 36
print(paste("Logistic Regression with Interaction Accuracy:",
round(logistic_accuracy_interaction, 4)))
[1] "Logistic Regression with Interaction Accuracy: 0.5385"
LDA with More Predictors
# LDA using all five lags plus Volume (library(MASS) was already
# loaded above; this call is redundant but harmless)
library(MASS)
lda_model_extended <- lda(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
data=train_data)
# Predictions: class labels for the 2009-2010 test weeks
lda_preds_extended <- predict(lda_model_extended, newdata=test_data)$class
# Confusion Matrix and held-out accuracy
conf_matrix_lda_extended <- table(Predicted = lda_preds_extended, Actual =
test_data$Direction)
lda_accuracy_extended <- mean(lda_preds_extended == test_data$Direction)
print(conf_matrix_lda_extended)
Actual
Predicted Down Up
Down 31 44
Up 12 17
print(paste("Extended LDA Accuracy:", round(lda_accuracy_extended, 4)))
[1] "Extended LDA Accuracy: 0.4615"
QDA with More Predictors
# QDA using all five lags plus Volume, fit on the training years
qda_model_extended <- qda(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
data=train_data)
# Predictions: class labels for the 2009-2010 test weeks
qda_preds_extended <- predict(qda_model_extended, newdata=test_data)$class
# Confusion Matrix and held-out accuracy
conf_matrix_qda_extended <- table(Predicted = qda_preds_extended, Actual =
test_data$Direction)
qda_accuracy_extended <- mean(qda_preds_extended == test_data$Direction)
print(conf_matrix_qda_extended)
Actual
Predicted Down Up
Down 33 49
Up 10 12
print(paste("Extended QDA Accuracy:", round(qda_accuracy_extended, 4)))
[1] "Extended QDA Accuracy: 0.4327"
Tuning K for KNN
library(class)
# Fit KNN on the Lag2-only train/test split for a given K and return
# the test confusion matrix together with the test accuracy.
# NOTE(review): knn() breaks distance ties at random; set a seed
# beforehand for exact reproducibility.
knn_evaluate <- function(k) {
  preds <- knn(
    train = as.matrix(train_data[, c("Lag2")]),
    test = as.matrix(test_data[, c("Lag2")]),
    cl = train_data$Direction,
    k = k
  )
  cm <- table(Predicted = preds, Actual = test_data$Direction)
  acc <- mean(preds == test_data$Direction)
  list(conf_matrix = cm, accuracy = acc)
}
# Score KNN over a range of neighbourhood sizes
knn_results <- lapply(c(1, 3, 5, 7, 10, 15, 20), knn_evaluate)
# Print results for each K
# Report the confusion matrix and held-out accuracy for each K.
# NOTE(review): the k values hard-coded below must stay in sync with
# the vector passed to lapply() above.
for (i in 1:length(knn_results)) {
print(paste("KNN with K =", c(1, 3, 5, 7, 10, 15, 20)[i]))
print(knn_results[[i]]$conf_matrix)
print(paste("Accuracy:", round(knn_results[[i]]$accuracy, 4)))
}
[1] "KNN with K = 1"
Actual
Predicted Down Up
Down 21 30
Up 22 31
[1] "Accuracy: 0.5"
[1] "KNN with K = 3"
Actual
Predicted Down Up
Down 16 19
Up 27 42
[1] "Accuracy: 0.5577"
[1] "KNN with K = 5"
Actual
Predicted Down Up
Down 15 22
Up 28 39
[1] "Accuracy: 0.5192"
[1] "KNN with K = 7"
Actual
Predicted Down Up
Down 16 19
Up 27 42
[1] "Accuracy: 0.5577"
[1] "KNN with K = 10"
Actual
Predicted Down Up
Down 17 22
Up 26 39
[1] "Accuracy: 0.5385"
[1] "KNN with K = 15"
Actual
Predicted Down Up
Down 20 20
Up 23 41
[1] "Accuracy: 0.5865"
[1] "KNN with K = 20"
Actual
Predicted Down Up
Down 20 21
Up 23 40
[1] "Accuracy: 0.5769"
Naive Bayes with More Predictors
# Naive Bayes using all five lags plus Volume (library(e1071) was
# already loaded above; this call is redundant but harmless)
library(e1071)
nb_model_extended <- naiveBayes(Direction ~ Lag1 + Lag2 +
Lag3 + Lag4 + Lag5 + Volume, data=train_data)
# Predictions: class labels for the 2009-2010 test weeks
nb_preds_extended <- predict(nb_model_extended, newdata=test_data)
# Confusion Matrix and held-out accuracy
conf_matrix_nb_extended <- table(Predicted = nb_preds_extended, Actual =
test_data$Direction)
nb_accuracy_extended <- mean(nb_preds_extended == test_data$Direction)
print(conf_matrix_nb_extended)
Actual
Predicted Down Up
Down 42 56
Up 1 5
print(paste("Extended Naive Bayes Accuracy:", round(nb_accuracy_extended, 4)))
[1] "Extended Naive Bayes Accuracy: 0.4519"
Comparing All Models
# Create a comparison table of held-out accuracies for every model
# fitted in parts (d)-(j).
# Fix: the second label was the R object name "logistic_model_extended"
# instead of a display label consistent with the other entries.
model_comparison <- data.frame(
  Model = c("Logistic Regression", "Logistic Regression (Extended)",
            "Logistic Regression (Interaction)",
            "LDA", "LDA (Extended)", "QDA", "QDA (Extended)",
            "KNN (K=1)", "KNN (K=3)", "KNN (K=5)", "KNN (K=7)",
            "KNN (K=10)", "KNN (K=15)", "KNN (K=20)",
            "Naive Bayes", "Naive Bayes (Extended)"),
  Accuracy = c(test_accuracy, logistic_accuracy_extended,
               logistic_accuracy_interaction,
               lda_accuracy, lda_accuracy_extended,
               qda_accuracy, qda_accuracy_extended,
               knn_results[[1]]$accuracy, knn_results[[2]]$accuracy,
               knn_results[[3]]$accuracy, knn_results[[4]]$accuracy,
               knn_results[[5]]$accuracy, knn_results[[6]]$accuracy,
               knn_results[[7]]$accuracy,
               nb_accuracy, nb_accuracy_extended)
)
# Print comparison results
print(model_comparison)
Model Accuracy
1 Logistic Regression 0.6250000
2 logistic_model_extended 0.4615385
3 Logistic Regression (Interaction) 0.5384615
4 LDA 0.6250000
5 LDA (Extended) 0.4615385
6 QDA 0.5865385
7 QDA (Extended) 0.4326923
8 KNN (K=1) 0.5000000
9 KNN (K=3) 0.5576923
10 KNN (K=5) 0.5192308
11 KNN (K=7) 0.5576923
12 KNN (K=10) 0.5384615
13 KNN (K=15) 0.5865385
14 KNN (K=20) 0.5769231
15 Naive Bayes 0.5865385
16 Naive Bayes (Extended) 0.4519231
6 Bonus question: ISL Exercise 4.8.4 (30pts)
Solution:
(a)
(b)
(c)
(d)
(e)